#!/usr/bin/env python3
import json, re, numpy as np, pandas as pd

# CSV file that main() loads with pandas.read_csv.
IN_CSV   = "outputs/lensing_plateau.csv"
# JSON file that main() writes its per-Mstar_bin slope results to.
OUT_JSON = "outputs/size_regression.json"  # overwrite with robust result

def rg_mid(label: str) -> float:
    """Return the midpoint of a range label such as ``"1.5-3.0"``.

    Surrounding whitespace is ignored and en/em dashes are treated as plain
    hyphens. The label must start with two unsigned decimal numbers separated
    by a dash; any text after the second number is ignored.

    Args:
        label: Range label of the form ``"<low>-<high>"``.

    Returns:
        ``0.5 * (low + high)``, or NaN when *label* is not a string (for
        example ``None`` or the float NaN used for a missing cell) or cannot
        be parsed as two numbers.
    """
    # Non-string inputs (None, float NaN, numbers) have no range to parse;
    # a float NaN is truthy, so an ``or ""`` fallback would not catch it.
    if not isinstance(label, str):
        return float("nan")
    s = label.strip().replace("—", "-").replace("–", "-")
    m = re.match(r"\s*([0-9.]+)\s*-\s*([0-9.]+)\s*", s)
    if not m:
        return float("nan")
    try:
        a, b = float(m.group(1)), float(m.group(2))
    except ValueError:
        # The character class also matches tokens such as "1.2.3" or "."
        # that float() rejects; treat them as unparseable.
        return float("nan")
    return 0.5 * (a + b)

def theilsen_slope(x, y):
    """Return the Theil–Sen slope: the median of all pairwise slopes.

    For every pair of points ``i < j`` the slope ``(y[j] - y[i]) / (x[j] - x[i])``
    is formed; pairs with a zero or non-finite difference in ``x`` or a
    non-finite difference in ``y`` are skipped, and the median of the
    remaining slopes is returned.

    Args:
        x: 1-D sequence of abscissa values, convertible to float.
        y: 1-D sequence of ordinate values with the same length as *x*.

    Returns:
        The median pairwise slope as a Python float, or NaN when there are
        fewer than two points or no usable pair.

    Raises:
        ValueError: If *x* and *y* are not 1-D arrays of the same length.
    """
    x = np.asarray(x, float)
    y = np.asarray(y, float)
    # Reject mismatched input explicitly: a length-1 ``y`` would otherwise
    # broadcast silently against ``x`` and yield a meaningless slope of 0.
    if x.ndim != 1 or x.shape != y.shape:
        raise ValueError("x and y must be 1-D sequences of equal length")
    n = x.size
    if n < 2:
        return float("nan")
    # Build only the i < j pairs instead of the full n-by-n matrix.
    i, j = np.triu_indices(n, k=1)
    dx = x[j] - x[i]
    dy = y[j] - y[i]
    usable = np.isfinite(dx) & np.isfinite(dy) & (dx != 0)
    if not usable.any():
        return float("nan")
    # Divide only the usable pairs so tied x values never trigger a
    # divide-by-zero / invalid-value floating-point warning.
    return float(np.median(dy[usable] / dx[usable]))

def _write_json(payload, path):
    """Write *payload* to *path* as indented JSON and close the file."""
    with open(path, "w", encoding="utf-8") as fh:
        json.dump(payload, fh, indent=2)


def main():
    """Fit a bootstrapped Theil–Sen slope of A_theta vs. R_G midpoint per Mstar_bin.

    Reads ``IN_CSV``, keeps rows whose ``claimable`` column reads "true"
    (case-insensitive), converts each ``R_G_bin`` label to its midpoint with
    ``rg_mid`` and drops rows whose midpoint or ``A_theta`` is missing or
    non-finite. For every ``Mstar_bin`` group it stores the number of rows,
    the Theil–Sen slope and the 16th/84th percentiles of 5000 bootstrap
    resamples, then writes the mapping to ``OUT_JSON``. Groups with fewer
    than three rows get NaN for the slope and both percentiles. When no row
    is claimable an empty JSON object is written instead.
    """
    df = pd.read_csv(IN_CSV)
    ok = df["claimable"].astype(str).str.lower() == "true"
    d = df.loc[ok, ["Mstar_bin", "R_G_bin", "A_theta"]].copy()
    if d.empty:
        _write_json({}, OUT_JSON)
        print("no claimables")
        return
    d["RG_mid"] = d["R_G_bin"].apply(rg_mid)
    d["A_theta"] = pd.to_numeric(d["A_theta"], errors="coerce")
    d = d.replace([np.inf, -np.inf], np.nan).dropna(subset=["A_theta", "RG_mid"])

    out = {}
    rng = np.random.default_rng(42)  # fixed seed keeps the intervals reproducible
    B = 5000  # number of bootstrap resamples per group

    for ms, g in d.groupby("Mstar_bin"):
        # JSON object keys must be plain str/int/float; numpy scalar keys
        # (from a numeric Mstar_bin column) would make json.dump raise.
        key = str(ms)
        x = g["RG_mid"].to_numpy()
        y = g["A_theta"].to_numpy()
        n = len(g)
        if n < 3:
            # NOTE: NaN values are emitted by json.dump as the non-standard
            # token ``NaN``; strict JSON parsers may reject the file.
            out[key] = {"n_stacks": int(n), "slope_Atheta_vs_RG": float("nan"),
                        "CI_16": float("nan"), "CI_84": float("nan"), "robust": True}
            continue
        mhat = theilsen_slope(x, y)
        boots = np.empty(B, float)
        for i in range(B):
            # Resample rows with replacement; x and y stay paired via idx.
            idx = rng.integers(0, n, size=n)
            boots[i] = theilsen_slope(x[idx], y[idx])
        # Resamples whose x values are all tied have no defined slope (NaN).
        boots = boots[np.isfinite(boots)]
        lo, hi = np.percentile(boots, [16, 84]) if boots.size else (float("nan"), float("nan"))
        out[key] = {"n_stacks": int(n), "slope_Atheta_vs_RG": float(mhat),
                    "CI_16": float(lo), "CI_84": float(hi), "robust": True}

    _write_json(out, OUT_JSON)
    print(f"Wrote {OUT_JSON} (robust Theil–Sen).")
# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
